# Display the saved figure of four depth-1 random-forest trees (stumps).
Image('AdaStumpForest.png')
We first need to understand the difference between weak and strong learners in the context of ML:
Step One:
Step Two:
Step Three:
Estimate the accuracy of the random forest:
Step Four:
Select the most accurate random forest based on its out-of-bag (OOB) error.
Important Notes:
# Display the saved figure of four trees from the random forest fitted with default settings.
Image('rForstMax.png')
# Display the saved figure of four depth-1 random-forest trees (stumps).
Image('AdaStumpForest.png')
Step One:
Step Two:
In classification tasks, some of our explanatory variables may be continuous. In that case, we follow three steps to calculate the Gini index:
$$$$
# Display the graphviz rendering of the depth-1 decision tree fitted on all four features.
Image('AdaRoot.png')
We need to determine how much say this stump has in our final decision based on how well it classified our samples.
We can see in our right leaf our stump made 1 error.
The total error for a stump is the sum of the weights associated with the incorrectly classified samples
where:
$\hat{y}_j^{(i)}$ is the $j^{th}$ predictor’s prediction for the $i^{th}$ instance.
Each instance weight $w^{(i)}$ is initially set to $\frac{1}{m}$
where:
# Display the amount-of-say plot.
# NOTE(review): no visible cell writes 'AmountSay.png' — confirm where it is saved.
Image('AmountSay.png')
# Display the graphviz rendering of the depth-1 decision tree fitted on Chest_Pain only.
Image('ChestPainAda.png')
# Display the saved amount-of-say plot for a total error of 0.4.
Image('ChestSay.png')
I have chosen to resample with our new weights:
# Display the saved figure of the first four stumps of the fitted AdaBoostClassifier.
Image('SklearnAdaVis.png')
Libraries:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_regression
import matplotlib.lines as mlines
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
import pandas as pd
import scipy
from IPython.display import display
from IPython.display import Image
from sklearn.datasets import make_regression
from sklearn import tree
import graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from mlxtend.plotting import plot_decision_regions
from sklearn.ensemble import AdaBoostClassifier
Full-Sized Random Forest:
# Fit a random forest with default settings on a synthetic heart-disease
# dataset and save a figure of its first four trees to 'rForstMax.png'.
np.random.seed(40)  # fixed seed for NumPy's global RNG so the data is reproducible

# Creating our dataset.
# df1: 20 rows; each binary column is '1' with probability 0.8 and Weight is
# drawn from 81..109 (randint's upper bound is exclusive).
df1 = pd.DataFrame({
    'Chest_Pain': np.random.choice(['1', '0'], p=[0.8, 0.2], size=20),
    'Weight': np.random.randint(81, 110, 20),
    'High_colesterol': np.random.choice(['1', '0'], p=[0.8, 0.2], size=20),
    'Strong_Blood_circulation': np.random.choice(['1', '0'], p=[0.8, 0.2], size=20),
    'Heart_disease': np.random.choice(['1', '0'], p=[0.8, 0.2], size=20),
})
# df2: 20 rows; each binary column is '1' with probability 0.2 (0.3 for
# High_colesterol) and Weight is drawn from 40..79.
df2 = pd.DataFrame({
    'Chest_Pain': np.random.choice(['1', '0'], p=[0.2, 0.8], size=20),
    'Weight': np.random.randint(40, 80, 20),
    'High_colesterol': np.random.choice(['1', '0'], p=[0.3, 0.7], size=20),
    'Strong_Blood_circulation': np.random.choice(['1', '0'], p=[0.2, 0.8], size=20),
    'Heart_disease': np.random.choice(['1', '0'], p=[0.2, 0.8], size=20),
})
rf = pd.concat([df1, df2], axis=0).reset_index(drop=True)

X = rf.drop('Heart_disease', axis=1).values
y = rf['Heart_disease'].values

# Feature names must describe the columns of X only, so the target column is
# excluded (previously all five column names were passed for four features).
fn = list(rf.columns.drop('Heart_disease'))
# plot_tree matches class_names to the class labels in ascending order
# ('0', '1'), so '0' -> 'No' and '1' -> 'Yes'. This assumes '1' encodes the
# presence of heart disease, consistent with the AdaRoot cell's ['NO', 'YES'].
cn = ['No', 'Yes']

clf = RandomForestClassifier()
clf = clf.fit(X, y)

# One subplot per tree; only the first four estimators are drawn.
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=[12, 5], dpi=900)
for index, ax in enumerate(axes):
    tree.plot_tree(clf.estimators_[index],
                   filled=True,
                   feature_names=fn,
                   class_names=cn,
                   ax=ax)
    ax.set_title('Estimator: ' + str(index), fontsize=11)
fig.savefig('rForstMax.png')
Visualizing a Forest of Stumps:
# Fit a random forest of depth-1 trees (stumps) on a bootstrap sample of the
# synthetic heart-disease dataset and save a figure of the first four stumps
# to 'AdaStumpForest.png'.
np.random.seed(40)  # fixed seed for NumPy's global RNG so the data is reproducible

# Creating our dataset.
# df1: 20 rows; each binary column is '1' with probability 0.8 and Weight is
# drawn from 81..109 (randint's upper bound is exclusive).
df1 = pd.DataFrame({
    'Chest_Pain': np.random.choice(['1', '0'], p=[0.8, 0.2], size=20),
    'Weight': np.random.randint(81, 110, 20),
    'High_colesterol': np.random.choice(['1', '0'], p=[0.8, 0.2], size=20),
    'Strong_Blood_circulation': np.random.choice(['1', '0'], p=[0.8, 0.2], size=20),
    'Heart_disease': np.random.choice(['1', '0'], p=[0.8, 0.2], size=20),
})
# df2: 20 rows; each binary column is '1' with probability 0.2 (0.3 for
# High_colesterol) and Weight is drawn from 40..79.
df2 = pd.DataFrame({
    'Chest_Pain': np.random.choice(['1', '0'], p=[0.2, 0.8], size=20),
    'Weight': np.random.randint(40, 80, 20),
    'High_colesterol': np.random.choice(['1', '0'], p=[0.3, 0.7], size=20),
    'Strong_Blood_circulation': np.random.choice(['1', '0'], p=[0.2, 0.8], size=20),
    'Heart_disease': np.random.choice(['1', '0'], p=[0.2, 0.8], size=20),
})
rf = pd.concat([df1, df2], axis=0).reset_index(drop=True)

# Sampling with replacement: 40 draws from the 40 rows.
bootstrapped = rf.sample(replace=True, n=40)
X = bootstrapped.drop('Heart_disease', axis=1).values
y = bootstrapped['Heart_disease'].values

# Feature names must describe the columns of X only, so the target column is
# excluded (previously all five column names were passed for four features).
fn = list(bootstrapped.columns.drop('Heart_disease'))
# plot_tree matches class_names to the class labels in ascending order
# ('0', '1'), so '0' -> 'No' and '1' -> 'Yes'. This assumes '1' encodes the
# presence of heart disease, consistent with the AdaRoot cell's ['NO', 'YES'].
cn = ['No', 'Yes']

# max_depth=1 restricts every tree to a single split, i.e. a stump.
clf = RandomForestClassifier(max_depth=1)
clf = clf.fit(X, y)

# One subplot per stump; only the first four estimators are drawn.
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=[12, 5], dpi=900)
for index, ax in enumerate(axes):
    tree.plot_tree(clf.estimators_[index],
                   filled=True,
                   feature_names=fn,
                   class_names=cn,
                   ax=ax)
    ax.set_title('Estimator: ' + str(index), fontsize=11)
fig.savefig('AdaStumpForest.png')
AdaRoot:
# Draw 10 distinct rows from `rf`, give each an equal sample weight, fit a
# single stump on all four features and render it with graphviz as 'AdaRoot.png'.
np.random.seed(45)
ada = rf.sample(n=10, replace=False)
ada['Sample_Weight'] = 1 / len(ada)  # uniform initial weights (1/10 each)

# Neither the target nor the weight column is a feature.
non_feature_columns = ['Heart_disease', 'Sample_Weight']
X1 = ada.drop(columns=non_feature_columns).values
y1 = ada['Heart_disease'].values

# The last two columns are Heart_disease and Sample_Weight; the rest are features.
fn = ada.columns[:-2]
cn = ['NO', 'YES']

tree_classifier = DecisionTreeClassifier(max_depth=1).fit(X1, y1)
dot_data = tree.export_graphviz(
    tree_classifier,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=fn,
    class_names=cn,
    filled=True,
    rounded=True,
    special_characters=True,
)

# Render the DOT source to a PNG named after `filename`.
p = graphviz.Source(dot_data, filename='AdaRoot', format='png')
p.render()
Chest Pain Ada:
# Fit a stump that may only split on Chest_Pain and render it with graphviz
# as 'ChestPainAda.png'. Relies on `ada` created by the AdaRoot cell.
X = ada['Chest_Pain'].values.reshape(-1, 1)  # single feature -> shape (n_samples, 1)
y = ada['Heart_disease'].values

# Name the feature explicitly rather than relying on its column position.
fn = ['Chest_Pain']
# export_graphviz matches class_names to the class labels in ascending order
# ('0', '1'), so '0' -> 'No' and '1' -> 'Yes'. The previous ['Yes', 'No']
# labelled the classes the opposite way round from the AdaRoot cell, which
# uses ['NO', 'YES'] on the same data.
cn = ['No', 'Yes']

tree_classifier = DecisionTreeClassifier(max_depth=1)
tree_classifier = tree_classifier.fit(X, y)
dot_data = tree.export_graphviz(
    tree_classifier,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=fn,
    class_names=cn,
    filled=True,
    rounded=True,
    special_characters=True,
)

# Render the DOT source to a PNG named after `filename`.
z = graphviz.Source(dot_data, filename='ChestPainAda', format='png')
z.render()
Calculating the amount of say (Chest Pain stump):
# Plot the amount-of-say curve 0.5 * log((1 - e) / e) over total error e and
# mark the point for a total error of 0.4; saved to 'ChestSay.png'.

# Total error of the stump (presumably 4 misclassified samples x weight 0.1).
weighted_sum = 0.1 * 4
# Amount of say for this stump: half the log-odds of being correct.
error = 0.5 * np.log((1 - weighted_sum) / weighted_sum)

# Curve over the open interval (0, 1); the end points are avoided because the
# expression diverges at 0 and 1.
a = np.linspace(0.00001, 0.99999, 300)
# Vectorised over `a`. The previous loop recomputed this same full array once
# per element of `a` and then plotted only the first copy.
total_error = 0.5 * np.log((1 - a) / a)

# Plotting
fig, ax1 = plt.subplots(figsize=(9, 9))
ax1.plot(a, total_error, linewidth=6)
ax1.scatter(0.4, error, c='#FF4500', edgecolor='black', s=100)
# Arrow from the bottom of the plot up to the marked point.
ax1.annotate('', xy=(0.4, error), xytext=(0.4, -6),
             arrowprops=dict(facecolor='black', shrink=0.02))
# Arrow from the marked point towards the y-axis.
ax1.annotate('', xy=(0, error + 0.06), xytext=(0.39, error - 0.03),
             arrowprops=dict(facecolor='black', shrink=0.02))

# Remove top and right spines
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)

ax1.text(0.35, 2, 'Amount of Say = $ \\frac{1}{2}log(\\frac{1-0.4}{0.4})$', fontsize=15, weight='bold')
ax1.text(0.35, 1, '$\\alpha_j=0.2$', fontsize=15, weight='bold')

# Providing subplots with titles
ax1.title.set_text('Amount of Say')
fig.savefig('ChestSay.png')
Calculating the amount of say (stump with one error):
# Plot the amount-of-say curve 0.5 * log((1 - e) / e) over total error e and
# mark the point for a total error of 0.1.
# NOTE(review): this figure is never saved here, yet 'AmountSay.png' is
# displayed elsewhere in the file — confirm whether a savefig call is missing.

# Total error of the stump (presumably 1 misclassified sample x weight 0.1).
weighted_sum = 0.1 * 1
# Amount of say for this stump: half the log-odds of being correct.
error = 0.5 * np.log((1 - weighted_sum) / weighted_sum)

# Curve over the open interval (0, 1); the end points are avoided because the
# expression diverges at 0 and 1.
a = np.linspace(0.00001, 0.99999, 300)
# Vectorised over `a`. The previous loop recomputed this same full array once
# per element of `a` and then plotted only the first copy.
total_error = 0.5 * np.log((1 - a) / a)

# Plotting
fig, ax1 = plt.subplots(figsize=(9, 9))
ax1.plot(a, total_error, linewidth=6)
ax1.scatter(0.1, error, c='#FF4500', edgecolor='black', s=100)
# Arrow from the bottom of the plot up to the marked point.
ax1.annotate('', xy=(0.1, error), xytext=(0.1, -6),
             arrowprops=dict(facecolor='black', shrink=0.02))
# Horizontal arrow from the marked point towards the y-axis.
ax1.annotate('', xy=(-0.03, error), xytext=(0.08, error),
             arrowprops=dict(facecolor='black', shrink=0.02))

# Remove top and right spines
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)

ax1.text(0.35, 2, 'Amount of Say = $ \\frac{1}{2}log(9)$', fontsize=15, weight='bold')
ax1.text(0.35, 1, '$\\alpha_j=1.098$', fontsize=15, weight='bold')

# Providing subplots with titles
ax1.title.set_text('Amount of Say')
Sklearn AdaBoost Visualization:
# Fit scikit-learn's AdaBoostClassifier (200 depth-1 stumps) on a 10-row
# sample of the synthetic heart-disease dataset and save a figure of the
# first four stumps to 'SklearnAdaVis.png'.
np.random.seed(40)  # fixed seed for NumPy's global RNG so the data is reproducible

# Creating our dataset.
# df1: 20 rows; each binary column is '1' with probability 0.8 and Weight is
# drawn from 81..109 (randint's upper bound is exclusive).
df1 = pd.DataFrame({
    'Chest_Pain': np.random.choice(['1', '0'], p=[0.8, 0.2], size=20),
    'Weight': np.random.randint(81, 110, 20),
    'High_colesterol': np.random.choice(['1', '0'], p=[0.8, 0.2], size=20),
    'Strong_Blood_circulation': np.random.choice(['1', '0'], p=[0.8, 0.2], size=20),
    'Heart_disease': np.random.choice(['1', '0'], p=[0.8, 0.2], size=20),
})
# df2: 20 rows; each binary column is '1' with probability 0.2 (0.3 for
# High_colesterol) and Weight is drawn from 40..79.
df2 = pd.DataFrame({
    'Chest_Pain': np.random.choice(['1', '0'], p=[0.2, 0.8], size=20),
    'Weight': np.random.randint(40, 80, 20),
    'High_colesterol': np.random.choice(['1', '0'], p=[0.3, 0.7], size=20),
    'Strong_Blood_circulation': np.random.choice(['1', '0'], p=[0.2, 0.8], size=20),
    'Heart_disease': np.random.choice(['1', '0'], p=[0.2, 0.8], size=20),
})
rf = pd.concat([df1, df2], axis=0).reset_index(drop=True)

# Draw 10 distinct rows. No Sample_Weight column is added in this cell.
np.random.seed(45)
ada = rf.sample(replace=False, n=10)
X1 = ada.drop(['Heart_disease'], axis=1).values
y1 = ada['Heart_disease'].values

# NOTE(review): algorithm="SAMME.R" was deprecated in scikit-learn 1.4 —
# confirm the scikit-learn version this is meant to run on.
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             n_estimators=200,
                             algorithm="SAMME.R",
                             learning_rate=0.5)
ada_clf.fit(X1, y1)

# Feature names must match the four columns of X1. The previous
# `ada.columns[:-2]` assumed a trailing Sample_Weight column that this cell
# never adds, so it produced only three names and plot_tree could fail with
# an IndexError on a stump that splits on the fourth feature.
fn = list(ada.columns.drop('Heart_disease'))
# plot_tree matches class_names to the class labels in ascending order
# ('0', '1'), so '0' -> 'No' and '1' -> 'Yes'. This assumes '1' encodes the
# presence of heart disease, consistent with the AdaRoot cell's ['NO', 'YES'].
cn = ['No', 'Yes']

# One subplot per stump; only the first four estimators are drawn.
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=[12, 5], dpi=900)
for index, ax in enumerate(axes):
    tree.plot_tree(ada_clf.estimators_[index],
                   filled=True, rounded=True, precision=3,
                   feature_names=fn,
                   class_names=cn,
                   ax=ax)
    ax.set_title('Estimator: ' + str(index), fontsize=11)
fig.savefig('SklearnAdaVis.png')